###-R script for the evaluation of multil-level modeling with poststratification using Eurobaromter data
### Question 8. Gender euqality for political leaders
### This version: 7 december 2014. Author: Dimiter Toshkov

# Required libraries and functions ------------------------------------------------------
library('memisc') #Import SPSS files
library(plyr) #Data manipulation
library(sampling) #Draw samples
library(car) #Recoding N.B.Needs to be loaded after 'memisc'. Otherwise the 'recode' function gets mixed up
library("SmarterPoland") #access to EUROSTAT data
library(arm) #multi-level data analysis
library(Metrics) #for calculating errors
library(pscl) #for calculating pseudo r-squared measures for glm
library(sorvi) #string strips
library(Kendall) #Kendall's tau
library(ggplot2) #for plots
library(scales) #for formatting percent axes
#function for sums ignoring NAs
sum.na<-function (x) {sum(x, na.rm=T)}
#function for calculating interclass correlation as in Buttice and Highton
icc.function<-function(x,y){  
  temp<-VarCorr(lmer(x~(1|y)))
  attributes(temp$y)$stddev^2/(attributes(temp)$sc^2+attributes(temp$y)$stddev^2)
}

# Part I. Getting the EUROBAROMETER public opinion data ------------------
#Load the data
data <- as.data.set(spss.system.file("E:/Project MLM and PS/ZA4233_v1-1-0.sav"))
#Get the geographical identifier
data$geoID<-as.factor(substring(as.character(data$v7),1,2))
#Subset the data for the needed countries
data1<-subset(data, geoID=='AT' | geoID=='BG'| geoID=='CY'
              | geoID=='CZ'| geoID=='DE'| geoID=='DK'| geoID=='EE'| geoID=='GR'
              | geoID=='ES'| geoID=='FI'| geoID=='FR'| geoID=='HU'
              | geoID=='IE'| geoID=='IT'| geoID=='LT'| geoID=='LU'
              | geoID=='LV'| geoID=='NL'| geoID=='PL'| geoID=='PT'
              | geoID=='RO'| geoID=='SI'| geoID=='SK'| geoID=='GB')

#outcome variable: gender equality
data1$trust<-ifelse(data1$v324=="Strongly agree" | data1$v324=="Tend to agree" , 1,0)
#Individual-level predictors: sex
data1$sex<-recode(as.factor(data1$v670), '"Female"="F";"Male"="M"')
#Individual-level predictors: age, recode to match the census data (see below)
data1$age<-recode(as.factor(data1$v673), ('"15 - 24 years"="Y15-24"; "25 - 34 years"="Y25-34";c("35 - 44 years", "45 - 54 years")="Y35-54";c("55 - 64 years","65 years and older")="Y55-74";else=NA'))
#Individual-level predictors: occupation, recode to match the census data (see below)
data1$work<-recode(as.factor(data1$v674), ('"Unemployed, temporarily not working"="UNE"; 
                                           c("Responsible for ordinary shopping, etc.","Student","Retired, unable to work")="INAC";
                                           else="EMP"'))
#Individual-level predictors: education, recode to match the census data (see below)
data1$edu<-recode(as.factor(data1$v669), (' "No full-time education"="NED";
                                            NA="UNK";
                                            "Up to 14 years"="ED1";
                                            c("15 years", "16 years","17 years")="ED2";
                                            c("18 years", "19 years")="ED3"; 
                                            c("20 years", "21 years")="ED4";
                                            else="ED5_6"                                            
                                                  '))
#Weighting factor
data1$weight<-ifelse(data1$geoID=='DE', data1$weight<-data1$v12, ifelse(data1$geoID=='GB',data1$weight<-data1$v10,data1$weight<-data1$v8))

#Subset the data
data2<-data1[,c('trust','geoID','sex','edu','work', 'age', "weight")]

#Add the additional country-level predictors 
allvars<-read.table("E:/allvars.txt")
data2<-merge(data2,allvars, by.x="geoID", by.y="geo")

#Order the data
data2<-data2[order(data2$geoID),]
#Make sure everything is a factor
data2$geoID<-as.factor(data2$geoID)
data2$trust<-as.factor(data2$trust)
data2$sex<-as.factor(data2$sex)
data2$edu<-as.factor(data2$edu)
data2$work<-as.factor(data2$work)
data2$age<-as.factor(data2$age)

# Part II. Get the real country means and describe variation ------------------
d_real<-ddply(data2,"geoID",function(X) data.frame(wm_real=weighted.mean(as.numeric(as.character(X$trust)), X$weight),
                                                   m_real=mean(as.numeric(as.character(X$trust)))))
d_real$n<-aggregate(data2$geoID, list(data2$geoID), FUN = length)$x
d_real$error<-sqrt((d_real$m_real-d_real$m_real^2)/d_real$n)
d_real$min<-d_real$m_real-1.96*d_real$error
d_real$max<-d_real$m_real+1.96*d_real$error
d_real$plusone<-d_real$m_real+1*d_real$error
d_real$minusone<-d_real$m_real-1*d_real$error
d_real$werror<-sqrt((d_real$wm_real-d_real$wm_real^2)/d_real$n)
d_real$wmin<-d_real$wm_real-1.96*d_real$werror
d_real$wmax<-d_real$wm_real+1.96*d_real$werror
d_real$wplusone<-d_real$wm_real+1*d_real$werror
d_real$wminusone<-d_real$wm_real-1*d_real$werror
summary(d_real$m_real)

#state-level standard deviations
state.sd_real<-sd(d_real$m_real)
round(state.sd_real, digits=2)
state.wsd_real<-sd(d_real$wm_real)

#state-level r-squared from the country-level model
states<-merge(d_real, allvars, by.x="geoID", by.y='geo')
summary(lm(m_real~reg+corupt, data=states)) #Find a good linear model
adj.r2<-summary(lm(m_real~reg+corupt, data=states))[9] #get the adj. r-squared from the model
round(adj.r2[[1]], digits=2)
#individual-level model only
summary(t1<-glm(trust~sex+edu+work+age, data=data2, family='binomial'))
mcfr2<-pR2(t1)[4] #McFadden's pseudo-R squared
round(mcfr2, digits=2)

#across vs. within variation
icc<-icc.function(data2$trust,data2$geoID) #Note that a linear version of the model is used
round(icc, digits=2)
# Part III. Getting the CENSUS data from EUROSTAT ------------------
#read the file
popul<-read.table('E:/popul.txt')
popul<-merge(popul, allvars, by.x="geoID", by.y="geo") #add the country level predictors

# Part IV. Get the sample weights from the Eurobarometer data ------------------------------------------------
#Get the country weights
population<-read.table("E:/population2001.txt")

# Part IV. Run the models on the full data and get the predictions------------------------------------------------
#Full model - mrp
mega_m<-glmer(trust~(1|geoID)+(1|age)+(1|sex)+(1|edu)+(1|work)+reg+corupt, family=binomial(link="logit"), data=data2)
#Country RE with country predictors - mrp0
mega_m0<-glmer(trust~(1|geoID)+reg+corupt, family=binomial(link="logit"), data=data2)
#Country RE - re0
mega_mre0<-glmer(trust~(1|geoID), family=binomial(link="logit"), data=data2)
#Individual REs - ie0
mega_ie0<-glmer(trust~+(1|age)+(1|sex)+(1|edu)+(1|work), family=binomial(link="logit"), data=data2)

#Obtain the predictions for the full model
popul$mega_cellpred <- invlogit(fixef(mega_m)["(Intercept)"]
                           +ranef(mega_m)$geoID[as.character(popul$geoID),1]
                           +ranef(mega_m)$age[as.character(popul$age),1]
                           +ranef(mega_m)$sex[as.character(popul$sex),1]
                           +ranef(mega_m)$edu[as.character(popul$edu),1]
                           +ranef(mega_m)$work[as.character(popul$work),1]
                           + fixef(mega_m)["reg"]*as.numeric(as.character(popul$reg.x))
                           + fixef(mega_m)["corupt"]*as.numeric(as.character(popul$corupt))
)
#Calculate the weighted prediction          
popul$mega_wpred<-popul$share*popul$mega_cellpred
#Get the state predictions
mega_statepred <- as.vector(tapply(popul$mega_wpred,popul$geoID,sum.na))


#Obtain the predictions for the individual-level random effects model only
popul$mega_ie0_cellpred <- invlogit(fixef(mega_ie0)["(Intercept)"]
                                +ranef(mega_ie0)$age[as.character(popul$age),1]
                                +ranef(mega_ie0)$sex[as.character(popul$sex),1]
                                +ranef(mega_ie0)$edu[as.character(popul$edu),1]
                                +ranef(mega_ie0)$work[as.character(popul$work),1]
                                
)
#Calculate the weighted prediction          
popul$mega_ie0_wpred<-popul$share*popul$mega_ie0_cellpred
#Get the state predictions
mega_ie0_statepred <- as.vector(tapply(popul$mega_ie0_wpred,popul$geoID,sum.na))

#Obtain the predictions for the model only country RE and FE only
popul$mega_m0_cellpred <- invlogit(fixef(mega_m0)["(Intercept)"]
                                   + ranef(mega_m0)$geoID[as.character(popul$geoID),1]
                                   + fixef(mega_m0)["reg"]*as.numeric(as.character(popul$reg.x))
                                   + fixef(mega_m0)["corupt"]*as.numeric(as.character(popul$corupt))
)
#Calculate the weighted prediction          
popul$mega_m0_wpred<-popul$share*popul$mega_m0_cellpred
#Get the state predictions
mega_m0_statepred <- as.vector(tapply(popul$mega_m0_wpred,popul$geoID,sum.na))

#Obtain the predictions for the model only country geo RE only
popul$mega_re0_cellpred <- invlogit(fixef(mega_mre0)["(Intercept)"]
                                   + ranef(mega_mre0)$geoID[as.character(popul$geoID),1]
                                   
)
#Calculate the weighted prediction          
popul$mega_re0_wpred<-popul$share*popul$mega_re0_cellpred
#Get the state predictions
mega_re0_statepred <- as.vector(tapply(popul$mega_re0_wpred,popul$geoID,sum.na))

mega_temp<-as.data.frame(matrix(NA,24,4))
colnames(mega_temp)<-c("full","ie","geoplus","geo")
for (i in 1:24) {
ifelse(mega_statepred[i]>d_real$plusone[i] | mega_statepred[i]<d_real$minusone[i], mega_temp[i,1]<-0 ,mega_temp[i,1]<-1)
ifelse(mega_ie0_statepred[i]>d_real$plusone[i] | mega_ie0_statepred[i]<d_real$minusone[i], mega_temp[i,2]<-0 ,mega_temp[i,2]<-1)
ifelse(mega_m0_statepred[i]>d_real$plusone[i] | mega_m0_statepred[i]<d_real$minusone[i], mega_temp[i,3]<-0 ,mega_temp[i,3]<-1)
ifelse(mega_re0_statepred[i]>d_real$plusone[i] | mega_re0_statepred[i]<d_real$minusone[i], mega_temp[i,4]<-0 ,mega_temp[i,4]<-1)
}

cor(d_real$m_real, mega_statepred)
cor(d_real$wm_real, mega_ie0_statepred)
cor(d_real$wm_real, mega_m0_statepred)
cor(d_real$wm_real, mega_re0_statepred)
display(mega_m)
# Part V. Prepare the output storage matrixes ------------------------------------------------
#0. Set the number of simulations
n=100 
#1. Naive disaggregation
naives<-as.data.frame(matrix(NA,24,n+1))
naives[,1]<-population$geo
#2. Model with random effect for geoID only
re0<-as.data.frame(matrix(NA,24,n+1))
re0[,1]<-population$geo
#3. Model with individual-level random effects only
ie0<-as.data.frame(matrix(NA,24,n+1))
ie0[,1]<-population$geo
#4. Model with random effect for geoID and country-level predictors
mrp0<-as.data.frame(matrix(NA,24,n+1))
mrp0[,1]<-population$geo
#5. Full model with random effect for geoID, country-level predictors and individual-level random effects
mrp<-as.data.frame(matrix(NA,24,n+1))
mrp[,1]<-population$geo

#6. Collect model summaries 
effects<-as.data.frame(matrix(NA,n+1, 2+5+4,))
colnames(effects)<-c("reg","corupt",
                     "geoID","edu","age","work","sex",
                     "deviance.full", "deviance.ie", "deviance.geo", "deviance.geoplus")

#7. Check coverage
coverage<-as.data.frame(matrix(NA,n,5))
colnames(coverage)<-c("naive","re","geo", "geoplus", "full")
cov.state<-as.data.frame(matrix(0,24,5+1))
colnames(cov.state)<-c("naive","re","geo", "geoplus", "full", "GeoID")
cov.state$GeoID<-population$geo

#8. Collect correlations
results<-as.data.frame(matrix(NA,4*5,3))
colnames(results)<-c("mean","median","stdev")
rownames(results)<-c("rho.naive","rho.ie", "rho.geo","rho.geoplus","rho.full",
                     "tau.naive","tau.ie","tau.geo","tau.geoplus","tau.full",
                     "mae.naive","mae.ie", "mae.geo","mae.geoplus","mae.full",
                     "rmse.naive","rmse.ie","rmse.geo","rmse.geoplus","rmse.full")

#9. Collect improvements
impro<-as.data.frame(matrix(NA,n,5))
colnames(impro)<-c("naive","ie","geo", "geoplus", "full")

# Part VI. Execute the simulations ------------------------------------------------
#just in case sort again
data2<-data2[order(data2$geoID),]
#start the loop
for (i in 1:n){
s<-strata(data2, c('geoID'), method='srswor', size=round(population$prop*1500, digits=0))
sample.data<-getdata(data2,s)
#Disaggregated (naive) country means
d_naive<-ddply(sample.data,"geoID",function(X) data.frame(m_naive=mean(as.numeric(as.character(X$trust)))))
naives[,i+1]<-d_naive$m_naive
# Fit the multi-level regression models 
m<-glmer(trust~(1|geoID)+(1|age)+(1|sex)+(1|edu)+(1|work)+reg+corupt, family=binomial(link="logit"), data=sample.data)
m0<-glmer(trust~(1|geoID)+reg+corupt, family=binomial(link="logit"), data=sample.data)
mre0<-glmer(trust~(1|geoID), family=binomial(link="logit"), data=sample.data)
mie0<-glmer(trust~(1|age)+(1|sex)+(1|edu)+(1|work), family=binomial(link="logit"), data=sample.data)

#record the model estimates
effects[i,1]<-fixef(m)[2]
effects[i,2]<-fixef(m)[3]
effects[i,3]<-attributes(VarCorr(m)[[1]])$stddev
effects[i,4]<-attributes(VarCorr(m)[[2]])$stddev
effects[i,5]<-attributes(VarCorr(m)[[3]])$stddev
effects[i,6]<-attributes(VarCorr(m)[[4]])$stddev
effects[i,7]<-attributes(VarCorr(m)[[5]])$stddev
effects[i,8]<-summary(m)$dev[[1]][8]
effects[i,9]<-summary(mie0)$dev[[1]][8]
effects[i,10]<-summary(mre0)$dev[[1]][8]
effects[i,11]<-summary(m0)$dev[[1]][8]

#Obtain the predictions for the full models
popul$cellpred <- invlogit(fixef(m)["(Intercept)"]
                     +ranef(m)$geoID[as.character(popul$geoID),1]
                     +ranef(m)$age[as.character(popul$age),1]
                     +ranef(m)$sex[as.character(popul$sex),1]
                     +ranef(m)$edu[as.character(popul$edu),1]
                     +ranef(m)$work[as.character(popul$work),1]
                     + fixef(m)["reg"]*as.numeric(as.character(popul$reg.x))
                     + fixef(m)["corupt"]*as.numeric(as.character(popul$corupt))
                     )
#Calculate the weighted prediction          
popul$wpred<-popul$share*popul$cellpred
#Get the state predictions
statepred <- as.vector(tapply(popul$wpred,popul$geoID,sum.na))
mrp[,i+1]<-statepred

#Obtain the predictions for the model with country level predictors only
popul$cellpred0 <- invlogit(fixef(m0)["(Intercept)"]
                           +ranef(m0)$geoID[as.character(popul$geoID),1]
                            + fixef(m)["reg"]*as.numeric(as.character(popul$reg.x))
                            + fixef(m)["corupt"]*as.numeric(as.character(popul$corupt)))
#Calculate the weighted prediction          
popul$wpred0<-popul$share*popul$cellpred0
#Get the state predictions
statepred0 <- as.vector(tapply(popul$wpred0,popul$geoID,sum.na))
mrp0[,i+1]<-statepred0

#Obtain the predictions for the model with country level random effects only
popul$cellpredre0 <- invlogit(fixef(mre0)["(Intercept)"]
                            +ranef(mre0)$geoID[as.character(popul$geoID),1]
                            )
#Calculate the weighted prediction          
popul$wpredre0<-popul$share*popul$cellpredre0
#Get the state predictions
statepredre0 <- as.vector(tapply(popul$wpredre0,popul$geoID,sum.na))
re0[,i+1]<-statepredre0

#Obtain the predictions for the individual-level random effects only model
popul$cellpredie <- invlogit(fixef(mie0)["(Intercept)"]
                           +ranef(mie0)$age[as.character(popul$age),1]
                           +ranef(mie0)$sex[as.character(popul$sex),1]
                           +ranef(mie0)$edu[as.character(popul$edu),1]
                           +ranef(mie0)$work[as.character(popul$work),1]
                           )
#Calculate the weighted prediction          
popul$wpredie<-popul$share*popul$cellpredie
#Get the state predictions
statepredie <- as.vector(tapply(popul$wpredie,popul$geoID,sum.na))
ie0[,i+1]<-statepredie

}

# Part VII. Evaluate the predictions ------------------------------------------------
#correlations
results["rho.naive",]<-c(mean(array(cor(d_real$m_real,naives[,-1]))),
                        median(array(cor(d_real$m_real,naives[,-1]))),
                        sd(array(cor(d_real$m_real,naives[,-1]))))
results["rho.ie",]<-c(mean(array(cor(d_real$m_real,ie0[,-1]))),
                       median(array(cor(d_real$m_real,ie0[,-1]))),
                       sd(array(cor(d_real$m_real,ie0[,-1]))))
results["rho.geo",]<-c(mean(array(cor(d_real$m_real,re0[,-1]))),
                         median(array(cor(d_real$m_real,re0[,-1]))),
                         sd(array(cor(d_real$m_real,re0[,-1]))))
results["rho.geoplus",]<-c(mean(array(cor(d_real$m_real,mrp0[,-1]))),
                         median(array(cor(d_real$m_real,mrp0[,-1]))),
                         sd(array(cor(d_real$m_real,mrp0[,-1]))))
results["rho.full",]<-c(mean(array(cor(d_real$m_real,mrp[,-1]))),
                         median(array(cor(d_real$m_real,mrp[,-1]))),
                         sd(array(cor(d_real$m_real,mrp[,-1]))))

#Kendal's tau
t1<-rep(NA,100)
t2<-rep(NA,100)
t3<-rep(NA,100)
t4<-rep(NA,100)
t5<-rep(NA,100)
for (i in 2:101) (t1[i-1]<-Kendall (d_real$m_real,naives[,i])$tau[1]) #sl for different measure
for (i in 2:101) (t2[i-1]<-Kendall (d_real$m_real,ie0[,i])$tau[1])
for (i in 2:101) (t3[i-1]<-Kendall (d_real$m_real,re0[,i])$tau[1])
for (i in 2:101) (t4[i-1]<-Kendall (d_real$m_real,mrp0[,i])$tau[1])
for (i in 2:101) (t5[i-1]<-Kendall (d_real$m_real,mrp[,i])$tau[1])
results["tau.naive",]<-c(mean(t1),median(t1),sd(t1))
results["tau.ie",]<-c(mean(t2),median(t2),sd(t2))
results["tau.geo",]<-c(mean(t3),median(t3),sd(t3))
results["tau.geoplus",]<-c(mean(t4),median(t4),sd(t4))
results["tau.full",]<-c(mean(t5),median(t5),sd(t5))

#RMSE
results["rmse.naive",]<-c(mean(array(apply(naives[,-1], 2 , "rmse" , d_real$m_real ))),
                          median(array(apply(naives[,-1], 2 , "rmse" , d_real$m_real ))),
                          sd(array(apply(naives[,-1], 2 , "rmse" , d_real$m_real ))))
results["rmse.ie",]<-c(mean(array(apply(ie0[,-1], 2 , "rmse" , d_real$m_real ))),
                        median(array(apply(ie0[,-1], 2 , "rmse" , d_real$m_real ))),
                        sd(array(apply(ie0[,-1], 2 , "rmse" , d_real$m_real ))))
results["rmse.geo",]<-c(mean(array(apply(re0[,-1], 2 , "rmse" , d_real$m_real ))),
                        median(array(apply(re0[,-1], 2 , "rmse" , d_real$m_real ))),
                        sd(array(apply(re0[,-1], 2 , "rmse" , d_real$m_real ))))
results["rmse.geoplus",]<-c(mean(array(apply(mrp0[,-1], 2 , "rmse" , d_real$m_real ))),
                            median(array(apply(mrp0[,-1], 2 , "rmse" , d_real$m_real ))),
                            sd(array(apply(mrp0[,-1], 2 , "rmse" , d_real$m_real ))))
results["rmse.full",]<-c(mean(array(apply(mrp[,-1], 2 , "rmse" , d_real$m_real ))),
                         median(array(apply(mrp[,-1], 2 , "rmse" , d_real$m_real ))),
                         sd(array(apply(mrp[,-1], 2 , "rmse" , d_real$m_real ))))
#MAE
results["mae.naive",]<-c(mean(array(apply(naives[,-1], 2 , "mae" , d_real$m_real ))),
                          median(array(apply(naives[,-1], 2 , "mae" , d_real$m_real ))),
                          sd(array(apply(naives[,-1], 2 , "mae" , d_real$m_real ))))
results["mae.ie",]<-c(mean(array(apply(ie0[,-1], 2 , "mae" , d_real$m_real ))),
                       median(array(apply(ie0[,-1], 2 , "mae" , d_real$m_real ))),
                       sd(array(apply(ie0[,-1], 2 , "mae" , d_real$m_real ))))
results["mae.geo",]<-c(mean(array(apply(re0[,-1], 2 , "mae" , d_real$m_real ))),
                        median(array(apply(re0[,-1], 2 , "mae" , d_real$m_real ))),
                        sd(array(apply(re0[,-1], 2 , "mae" , d_real$m_real ))))
results["mae.geoplus",]<-c(mean(array(apply(mrp0[,-1], 2 , "mae" , d_real$m_real ))),
                            median(array(apply(mrp0[,-1], 2 , "mae" , d_real$m_real ))),
                            sd(array(apply(mrp0[,-1], 2 , "mae" , d_real$m_real ))))
results["mae.full",]<-c(mean(array(apply(mrp[,-1], 2 , "mae" , d_real$m_real ))),
                         median(array(apply(mrp[,-1], 2 , "mae" , d_real$m_real ))),
                         sd(array(apply(mrp[,-1], 2 , "mae" , d_real$m_real ))))
results

##Check how many of the MRP-estimates are within the 95 confidence intervals of the real means
for (i in 1:n){
  temp1<-matrix(NA,24,2)
  temp2<-matrix(NA,24,2)
  temp3<-matrix(NA,24,2)
  temp4<-matrix(NA,24,2)
  temp5<-matrix(NA,24,2)
  for (j in 1:24){
    ifelse(naives[j,i+1]>d_real$max[j] | naives[j,i+1]<d_real$min[j], temp1[j,1]<-0 ,temp1[j,1]<-1)
    ifelse(ie0[j,i+1]>d_real$max[j] | ie0[j,i+1]<d_real$min[j], temp2[j,1]<-0 ,temp2[j,1]<-1)
    ifelse(re0[j,i+1]>d_real$max[j] | re0[j,i+1]<d_real$min[j], temp3[j,1]<-0 ,temp3[j,1]<-1)
    ifelse(mrp0[j,i+1]>d_real$max[j] | mrp0[j,i+1]<d_real$min[j], temp4[j,1]<-0 ,temp4[j,1]<-1)
    ifelse(mrp[j,i+1]>d_real$max[j] | mrp[j,i+1]<d_real$min[j], temp5[j,1]<-0 ,temp5[j,1]<-1)
  
  cov.state [j,1]<-cov.state [j,1]+temp1[j,1]
  cov.state [j,2]<-cov.state [j,2]+temp2[j,1]
  cov.state [j,3]<-cov.state [j,3]+temp3[j,1]
  cov.state [j,4]<-cov.state [j,4]+temp4[j,1]
  cov.state [j,5]<-cov.state [j,5]+temp5[j,1]
  }
  coverage[i,1]<-sum(temp1[,1])
  coverage[i,2]<-sum(temp2[,1])
  coverage[i,3]<-sum(temp3[,1])
  coverage[i,4]<-sum(temp4[,1])
  coverage[i,5]<-sum(temp5[,1])
}
coverage
cov.state

##Collect deviations within each run
impro<-as.data.frame(matrix(NA,n,5))
colnames(impro)<-c("naive","ie","geo", "geoplus", "full")
for (i in 1:n){
impro[i,1]<-sum(abs(d_real$m_real-naives[,i+1]))/24
impro[i,2]<-sum(abs(d_real$m_real-ie0[,i+1]))/24
impro[i,3]<-sum(abs(d_real$m_real-re0[,i+1]))/24
impro[i,4]<-sum(abs(d_real$m_real-mrp0[,i+1]))/24
impro[i,5]<-sum(abs(d_real$m_real-mrp[,i+1]))/24
}

#Write the tables of results
write.table(coverage, "E:/gender_coverage_prop_noweight.txt")
write.table(cov.state, "E:/gender_covered_states_prop_noweight.txt")
write.table(results, "E:/gender_results_prop_noweight.txt")
write.table(effects, "E:/gender_effects_prop_noweight.txt")
write.table(impro, "E:/gender_impro_prop_noweight.txt")
write.table(mrp, "E:/gender_mrp_prop_noweight.txt")
write.table(re0, "E:/gender_re0_prop_noweight.txt")
write.table(mrp0, "E:/gender_mrp0_prop_noweight.txt")
write.table(naives, "E:/gender_naives_prop_noweight.txt")
write.table(ie0, "E:/gender_ie0_prop_noweight.txt")

#### Overviews
round(results[c(5,10,15,20),c(1,3)],digits=2)
summary(coverage)[4,][5]
round (sd(coverage[,5]), digits=2)
round(results[c(1,3,4,5),c(1,3)],digits=2)
round(results[c(11,13,14,15),c(1,3)],digits=2)
summary(coverage)[4,]
round(sd(coverage[,1]), digits=1)
round(sd(coverage[,3]), digits=1)
round(sd(coverage[,4]), digits=1)
round(sd(coverage[,5]), digits=1)

)
###Boxplots of correlations
p<-as.data.frame(cbind(
array(cor(d_real$wm_real,naives[,-1])),
array(cor(d_real$wm_real,re0[,-1])),
array(cor(d_real$wm_real,mrp0[,-1])),
array(cor(d_real$wm_real,mrp[,-1]))
))
colnames(p)<-c("naive","geo.re","geo.re+","full.model")
boxplot(p, horizontal=T, xlab="Correlation with the 'true' values")
pdf(file='E:/Q8_prop_boxplot.pdf', width=11,height=9)
boxplot(p, horizontal=T, xlab="Correlation with the 'true' values")
dev.off()


###The Figure
d_temp<-d_real
d_temp$m_real<-d_real$m_real+0.0001
d_real$index<-"A"
d_temp$index<-"B"
d_real$m<-d_real$m_real
d_temp$m<-rowMeans(mrp[-1])

for (i in 1:nrow(mrp)){
  d_temp$min[i]<-quantile(as.numeric(mrp[i,-1]), prob=0.05)
  d_temp$max[i]<-quantile(as.numeric(mrp[i,-1]), prob=0.95)
}

d_temp$geoID<-paste(sep=".", as.character(d_temp$geoID), "1")

d_empty<-d_temp
d_empty$geoID<-paste(sep=".", as.character(d_real$geoID), "2")
d_empty$m_real<-d_real$m_real-0.00001
d_empty$index<-"C"

dd<-rbind(d_real,d_temp, d_empty)
dd<-dd[order(-dd$m_real),]
dd$geoID <- factor(dd$geoID, as.character(dd$geoID))

plot1 <- ggplot(dd, aes(m, geoID, group=geoID, colour=index)) + geom_point(size = 3.5) + ylab("")+
  xlab('Gender equality') +
  geom_segment(aes(x=min, xend=max, yend=geoID))+theme_bw()
plot2<-plot1+scale_y_discrete(labels=dd$geoID[c(seq(2,72, by=3))], breaks=dd$geoID[c(seq(2,72, by=3))])+
  theme(legend.position=c(.8, 0.9)) +
  scale_x_continuous(labels=percent)+
  scale_colour_manual(name="State means", values = c("black","red", "white"), labels=c("  Real (full sample)", "  MRP-based", ""))

pdf(file='E:/Q8_prop_noweight.pdf', width=11,height=9)
plot2
dev.off()
